
library(dplyr)
library(haven)
library(readr)
library(assertr)

setwd('./NC Data')
rawdata_path = 'raw_data_from_ncerdc'

## Read raw data for years 2007 to 2014
MB = setNames(vector(mode="list", length=8), 2007:2014)

for (yr in 2007:2014){
  
  cat('Converting .dta file for year',yr,'\n')

  if (yr %in% 2007:2012){
    dta_file = sprintf('mb_%d_pub.dta',yr)
  } else if (yr %in% 2013:2014){
    dta_file = sprintf('pcaudit_pub%d.dta',yr)
  }
  
  D = read_dta(file.path(rawdata_path,dta_file),encoding="latin1") %>%
    rename(any_of(c(lep = 'lep_current', ethnic = 'ethnicity', 
                    ma_score = 'ma_scoreN', rd_score = 'rd_scoreN', 
                    ma_score = 'pc_ma_score', rd_score = 'pc_rd_score'))) %>%
    bind_rows(tibble(eds = character())) %>%
    select(mastid,grade,ethnic,sex,lep,eds,ma_score,rd_score) %>%
    mutate(across(c("mastid","grade","ma_score","rd_score"),as.numeric),
           across(c("ethnic","sex","lep","eds"),as.character),
           year = yr) %>%
    filter(!is.na(mastid))

  MB[[paste(yr)]] = D 
}  
MB = bind_rows(MB)

## Functions
max_na = function(x) ifelse(all(is.na(x)), NA, max(x, na.rm = TRUE))
min_na = function(x) ifelse(all(is.na(x)), NA, min(x, na.rm = TRUE))
mode_or_last = function(x){
  y = DescTools::Mode(x, na.rm = TRUE)
  if (is.na(y[1])){
    y = last(x[!is.na(x)])
  } else if (length(y)>1){
    y = last(x[x %in% y])
  } else {
    y = y[1]
  }
  return(y)
}

## Code race and sex using all available data
##  Assign modal value or last value across all years
demog = MB %>%
  mutate(race = case_when(ethnic %in% c('BLCK','B','4') ~ 'BLCK',
                          ethnic %in% c('HISP','H','3') ~ 'HISP',
                          ethnic %in% c('AMIN','I','1') ~ 'AMIN',
                          ethnic %in% c('ASIA','A','2') ~ 'ASIA',
                          ethnic %in% c('MULT','M','6') ~ 'MULT',
                          ethnic %in% c('WHTE','W','5') ~ 'WHTE',
                          ethnic=='P' ~ 'P',
                          TRUE ~ NA)) %>%
  verify(!is.na(race), description = 'race value is missing') %>%
  verify(!is.na(sex), description = 'sex value is missing') %>%
  arrange(mastid,year) %>%
  group_by(mastid) %>%
  summarize(sex = mode_or_last(sex),
            race = mode_or_last(race),.groups='drop') %>%
  select(mastid,sex,race)
            
## Score data for grades 3 to 8
##  only allow duplicated if same grade in year
##  use min of scores and max of LEP and FRL

scr = MB %>%
  select(-c(sex,ethnic)) %>%
  mutate(lep = case_when(lep=='Y' ~ 1,
                         lep %in% c('N','1','U') ~ 0,
                         TRUE ~ NA),
         poverty = case_when(eds=='Y' ~ 1,
                             eds=='N' ~ 0,
                             TRUE ~ NA)) %>%
  verify(!is.na(lep), description = 'lep value is missing') %>%
  verify(!is.na(poverty) | (is.na(poverty) & year==2007), description = 'poverty value is missing') %>%
  group_by(mastid,year) %>%
  filter(!any(is.na(grade)) & min(grade)==max(grade) & grade %in% (3:8)) %>%
  summarize(grade = first(grade),
            poverty = max_na(poverty),
            lep = max_na(lep),
            ma_score = min_na(ma_score),
            rd_score = min_na(rd_score), .groups = 'drop')

data = scr %>%
  left_join(demog, by = c('mastid')) %>%
  mutate(ma_min = case_when(year==2007 & grade==3 ~ 310,
                            year==2007 & grade %in% (4:8) ~ 320,
                            year %in% (2008:2012) & grade==3 ~ 310,
                            year %in% (2008:2012) & grade==4 ~ 315,
                            year %in% (2008:2012) & grade %in% (5:8) ~ 325,
                            year %in% (2013:2014) & grade==3 ~ 420,
                            year %in% (2013:2014) & grade %in% (4:8) ~ 424,
                            TRUE ~ NA),
         rd_min = case_when(year==2007 & grade==3 ~ 215,
                            year==2007 & grade %in% (4:8) ~ 220,
                            year %in% (2008:2012) & grade==3 ~ 300,
                            year %in% (2008:2012) & grade==4 ~ 310,
                            year %in% (2008:2012) & grade %in% (5:8) ~ 319,
                            year %in% (2013:2014) & grade==3 ~ 405,
                            year %in% (2013:2014) & grade==4 ~ 410,
                            year %in% (2013:2014) & grade %in% (4:8) ~ 418,
                            TRUE ~ NA)) %>%
  verify(!is.na(ma_min), description = 'minimum math score not assigned') %>%
  verify(!is.na(rd_min), description = 'minimum reading score not assigned') %>%
  mutate(ma_score = replace(ma_score, ma_score<ma_min, NA),
         rd_score = replace(rd_score, rd_score<rd_min, NA)) %>%
  group_by(year,grade) %>%
  mutate(ma_score = scale(ma_score)[,1],
         rd_score = scale(rd_score)[,1]) %>%
  arrange(mastid,year) %>%
  group_by(mastid) %>%
  mutate(lag_year = lag(year),
         lag_grade = lag(grade),
         lag_ma_score = lag(ma_score),
         lag_rd_score = lag(rd_score),
         promo = ((year - lag_year)==1) & ((grade - lag_grade)==1),
         promo = replace(promo,is.na(promo),FALSE),
         lag_ma_score = replace(lag_ma_score, !promo, NA),
         lag_rd_score = replace(lag_rd_score, !promo, NA)) %>%
  ungroup() %>%
  filter(year>=2008 & grade>=4) %>%
  select(mastid,year,grade,sex,race,poverty,lep,ma_score,rd_score,lag_ma_score,lag_rd_score)
         
write_rds(data,'student_scores.rds')

